In [1]:
import pandas as pd
from sklearn import linear_model, metrics, model_selection, pipeline, preprocessing
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
df = pd.read_csv("/data/creditcard-fraud.csv.gz")
df.head()
Out[2]:
In [3]:
df.info()
In [4]:
df.Class.value_counts()
Out[4]:
In [5]:
df.Class.value_counts(normalize=True)
Out[5]:
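With fraud at a small fraction of a percent of all rows, raw accuracy is a misleading yardstick here. As a minimal illustration (my sketch, not part of the original notebook), a majority-class baseline scores accuracy equal to the legitimate-transaction share while catching zero fraud:
from sklearn import dummy

baseline = dummy.DummyClassifier(strategy="most_frequent")  # always predicts the majority class, i.e. "not fraud"
baseline.fit(df.loc[:, "V1":"V28"], df["Class"])
print(baseline.score(df.loc[:, "V1":"V28"], df["Class"]))   # accuracy equals the majority-class share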
In [6]:
target = "Class"
y = df[target]
X = df.loc[:, "V1":"V28"]
#X = df.drop(columns=target)
#X = pd.get_dummies(X, drop_first=True)
features = X.columns
# a random split is kept here for reference; the notebook splits chronologically instead
# X_train, X_test, y_train, y_test = model_selection.train_test_split(
#     X, y, test_size=0.3, random_state=1)
training_size = int(0.7 * len(df))  # first 70% of rows for training, last 30% for testing
X_train = X.values[:training_size, :]
y_train = y.values[:training_size]
X_test = X.values[training_size:, :]
y_test = y.values[training_size:]
pipe = pipeline.Pipeline([
    # degree=1 leaves the features unchanged; raising the degree would add interaction terms
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.LogisticRegression(solver="liblinear"))
])
pipe.fit(X_train, y_train)
y_train_predict = pipe.predict(X_train)
y_test_predict = pipe.predict(X_test)
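As a quick sanity check (my addition), the chronological split should leave a broadly similar fraud rate on both sides; a large mismatch would make the test metrics below hard to interpret:
print("train fraud rate:", y_train.mean())
print("test fraud rate:", y_test.mean())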
In [7]:
print(metrics.classification_report(y_test, y_test_predict))
In [8]:
print("confusion metrics: \n", metrics.confusion_matrix(y_test, y_test_predict))
In [9]:
print("TPR", 55/(55+53))
print("FPR", 14/(14+85321))
In [10]:
print("confusion metrics: \n", metrics.confusion_matrix(y_test, y_test_predict))
print("train preicision: ", metrics.precision_score(y_train, y_train_predict))
print("test preicision: ", metrics.precision_score(y_test, y_test_predict))
print("train recall: ", metrics.recall_score(y_train, y_train_predict))
print("test recall: ", metrics.recall_score(y_test, y_test_predict))
print("train f1: ", metrics.f1_score(y_train, y_train_predict))
print("test f1: ", metrics.f1_score(y_test, y_test_predict))
print("train accuracy: ", metrics.accuracy_score(y_train, y_train_predict))
print("test accuracy: ", metrics.accuracy_score(y_test, y_test_predict))
In [11]:
y_test_prob = pipe.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_prob)
auc = metrics.auc(fpr, tpr)
plt.plot(fpr, tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC AUC = " + str(auc))
Out[11]:
In [12]:
metrics.roc_auc_score(y_test, y_test_prob)
Out[12]:
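With classes this skewed, the precision-recall curve is a useful companion to ROC, since it is not dominated by the overwhelming true-negative count. A sketch using sklearn's standard API (not in the original notebook):
precision, recall, _ = metrics.precision_recall_curve(y_test, y_test_prob)
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("PR AP = " + str(metrics.average_precision_score(y_test, y_test_prob)))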
Sum of the transaction amounts in the true-negative (TN) bucket of the test set.
In [13]:
df.Amount.values[training_size:][(y_test == 0) & (y_test_predict == 0)].sum()
Out[13]:
Sum of the transaction amounts in the false-negative (FN) bucket of the test set, i.e. fraud the model let through.
In [14]:
df.Amount.values[training_size:][(y_test == 1) & (y_test_predict == 0)].sum()
Out[14]:
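The same mask-and-sum pattern extends to all four confusion buckets; a sketch (variable names are mine) tabulating the dollar amount in each:
amounts_test = df.Amount.values[training_size:]
for name, mask in [("TN", (y_test == 0) & (y_test_predict == 0)),
                   ("FP", (y_test == 0) & (y_test_predict == 1)),
                   ("FN", (y_test == 1) & (y_test_predict == 0)),
                   ("TP", (y_test == 1) & (y_test_predict == 1))]:
    print(name, amounts_test[mask].sum())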
The FN amount as a percentage of the TN amount (values copied from the two outputs above).
In [16]:
100 * 8336.05 / 7224977.58
Out[16]:
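Since the business cost lives in missed-fraud dollars rather than in raw counts, a natural follow-up (my sketch; the 0.1 cutoff is an arbitrary illustration, not from the notebook) is to lower the decision threshold and re-measure the FN amount against the extra false positives it buys:
y_test_predict_low = (y_test_prob >= 0.1).astype(int)  # flag fraud at a lower probability cutoff
fn_mask = (y_test == 1) & (y_test_predict_low == 0)
print("FN amount at threshold 0.1:", df.Amount.values[training_size:][fn_mask].sum())
print("false positives at threshold 0.1:", ((y_test == 0) & (y_test_predict_low == 1)).sum())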
In [ ]: